setwd('/Users/emmatimminsschiffman/Documents/Dissertation/proteomics/DB post-genome/combined OT and QE')

spec.dat<-read.csv('spec counts OT and QE.csv', header=T, row.names=1)
#2504 proteins total

treatment<-read.csv('treatment groupings.csv',header=T, row.names=1)

#remove proteins that are expressed in fewer than half the oysters
spec.dat1<-drop.var(spec.dat, min.fo=8)
str(spec.dat1)
#2164 proteins remaining
#remove proteins that are super abundant
spec.dat2<-drop.var(spec.dat, max.po=0.95)
str(spec.dat2)
#no proteins remaining
#drop proteins with little variation
spec.dat3<-drop.var(spec.dat, min.cv=5)
str(spec.dat3)
#all proteins remaining

#data transformation log(x+1)
specdat.tra<-(spec.dat+1)
specdat.tra<-data.trans(specdat.tra, method='log', plot=F)

specdat.tra1<-(spec.dat1+1)
specdat.tra1<-data.trans(specdat.tra1, method='log', plot=F)

#NMDS
specdat.nmds<-metaMDS(specdat.tra, distance='bray', k=2, trymax=100, autotransform=F)

vec.specdat<-envfit(specdat.nmds$points, specdat.tra, perm=100)

ordiplot(specdat.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
plot(vec.specdat, p.max=0.01, col='blue', cex=0.5)
ordihull(specdat.nmds, treatment[,3], label=T, col='magenta')

specdat.nmds1<-metaMDS(specdat.tra1, distance='bray', k=2, trymax=100, autotransform=F)
vec.specdat1<-envfit(specdat.nmds1$points, specdat.tra1, perm=100)
ordiplot(specdat.nmds1, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
plot(vec.specdat1, p.max=0.01, col='blue', cex=0.5)
ordihull(specdat.nmds1, treatment[,3], label=T, col='magenta')
#looks really similar to NMDS with all proteins

#ANOSIM
spec.row<-data.stand(spec.dat, method='total', margin='row', plot=F)
spec.d<-vegdist(spec.row, 'bray')

#anosim for pCO2 only
spec.anosim1<-anosim(spec.d, grouping=treatment[,1])
summary(spec.anosim1)
#R=-0.01786, p=0.571
#MS only
spec.anosim2<-anosim(spec.d, grouping=treatment[,2])
summary(spec.anosim2)
#R=-0.005022, p=0.478
#all treatments
spec.anosim3<-anosim(spec.d, grouping=treatment[,3])
summary(spec.anosim3)
#R=-0.002604, p=0.488

#GO Terms
go.dat<-read.csv('spec counts GO OT and QE.csv', header=T, row.names=1)
godat.tra<-(go.dat+1)
godat.tra<-data.trans(godat.tra,method='log', plot=F)

godat.nmds<-metaMDS(godat.tra, distance='bray', k=2, trymax=100, autotransform=F)
vec.godat<-envfit(godat.nmds$points, godat.tra, perm=100)

ordiplot(godat.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
plot(vec.godat, p.max=0.01, col='blue', cex=0.5)
ordihull(godat.nmds, treatment[,3], label=T, col='magenta')

go.row<-data.stand(go.dat, method='total', margin='row', plot=F)
go.d<-vegdist(go.row, 'bray')

#pco2 only
go.anosim1<-anosim(go.d, grouping=treatment[,1])
summary(go.anosim1)
#R=0.1094, p=0.115

#MS only
go.anosim2<-anosim(go.d, grouping=treatment[,2])
summary(go.anosim2)
#R=0.02734, p=0.348

#all treatments
go.anosim3<-anosim(go.d, grouping=treatment[,3])
summary(go.anosim3)
#R=0.1623, p=0.067

#Histograms of number of peptides sequenced per protein
OT.spec<-read.csv('total OT spec counts.csv', header=T, row.names=1)
hist(OT.spec$Total, breaks=100, xlab='Peptides Sequenced per Protein', ylab='Number of Proteins', main='Orbitrap', ylim=c(0,500))

QE.spec<-read.csv('total QE spec counts.csv', header=T, row.names=1)
hist(QE.spec$Total, breaks=200, xlab='Peptides Sequenced per Protein', ylab='Number of Proteins', main='Q Exactive', ylim=c(0,3000))

#what does it look like when we get rid of all proteins that only have 5 or fewer peptide hits?
QE.spec2<-read.csv('total QE spec counts abridged.csv', header=T, row.names=1)
hist(QE.spec2$Total, breaks=100, xlab='Peptides Sequenced per Protein', ylab='Number of Proteins', main='Q Exactive')